kreuzberg-3.3.0-py3-none-any.whl → kreuzberg-3.8.1-py3-none-any.whl
This diff compares the contents of two publicly released versions of the package, as they appear in their public registry. It is provided for informational purposes only.
- kreuzberg/__init__.py +9 -2
- kreuzberg/_api/__init__.py +0 -0
- kreuzberg/_api/main.py +87 -0
- kreuzberg/_entity_extraction.py +238 -0
- kreuzberg/_extractors/_base.py +39 -1
- kreuzberg/_extractors/_email.py +149 -0
- kreuzberg/_extractors/_html.py +15 -3
- kreuzberg/_extractors/_image.py +27 -22
- kreuzberg/_extractors/_pandoc.py +3 -14
- kreuzberg/_extractors/_pdf.py +97 -34
- kreuzberg/_extractors/_presentation.py +62 -10
- kreuzberg/_extractors/_spread_sheet.py +181 -6
- kreuzberg/_extractors/_structured.py +148 -0
- kreuzberg/_gmft.py +318 -11
- kreuzberg/_language_detection.py +95 -0
- kreuzberg/_mcp/__init__.py +5 -0
- kreuzberg/_mcp/server.py +227 -0
- kreuzberg/_mime_types.py +27 -1
- kreuzberg/_ocr/__init__.py +10 -1
- kreuzberg/_ocr/_base.py +59 -0
- kreuzberg/_ocr/_easyocr.py +92 -1
- kreuzberg/_ocr/_paddleocr.py +89 -0
- kreuzberg/_ocr/_tesseract.py +569 -5
- kreuzberg/_registry.py +4 -0
- kreuzberg/_types.py +181 -4
- kreuzberg/_utils/_cache.py +52 -4
- kreuzberg/_utils/_device.py +2 -2
- kreuzberg/_utils/_errors.py +3 -7
- kreuzberg/_utils/_process_pool.py +182 -9
- kreuzberg/_utils/_quality.py +237 -0
- kreuzberg/_utils/_serialization.py +4 -2
- kreuzberg/_utils/_string.py +153 -10
- kreuzberg/_utils/_sync.py +6 -7
- kreuzberg/_utils/_table.py +261 -0
- kreuzberg/_utils/_tmp.py +2 -2
- kreuzberg/cli.py +1 -2
- kreuzberg/extraction.py +43 -34
- kreuzberg-3.8.1.dist-info/METADATA +301 -0
- kreuzberg-3.8.1.dist-info/RECORD +53 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/entry_points.txt +1 -0
- kreuzberg/_multiprocessing/__init__.py +0 -6
- kreuzberg/_multiprocessing/gmft_isolated.py +0 -332
- kreuzberg/_multiprocessing/process_manager.py +0 -188
- kreuzberg/_multiprocessing/sync_tesseract.py +0 -261
- kreuzberg/_multiprocessing/tesseract_pool.py +0 -359
- kreuzberg-3.3.0.dist-info/METADATA +0 -235
- kreuzberg-3.3.0.dist-info/RECORD +0 -48
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/WHEEL +0 -0
- {kreuzberg-3.3.0.dist-info → kreuzberg-3.8.1.dist-info}/licenses/LICENSE +0 -0
kreuzberg/__init__.py
CHANGED
@@ -1,11 +1,15 @@
+from importlib.metadata import version
+
+from kreuzberg._entity_extraction import SpacyEntityExtractionConfig
 from kreuzberg._gmft import GMFTConfig
+from kreuzberg._language_detection import LanguageDetectionConfig
 from kreuzberg._ocr._easyocr import EasyOCRConfig
 from kreuzberg._ocr._paddleocr import PaddleOCRConfig
 from kreuzberg._ocr._tesseract import TesseractConfig
 
 from ._ocr._tesseract import PSMMode
 from ._registry import ExtractorRegistry
-from ._types import ExtractionConfig, ExtractionResult, Metadata, TableData
+from ._types import Entity, ExtractionConfig, ExtractionResult, Metadata, TableData
 from .exceptions import KreuzbergError, MissingDependencyError, OCRError, ParsingError, ValidationError
 from .extraction import (
     batch_extract_bytes,
@@ -18,21 +22,24 @@ from .extraction import (
     extract_file_sync,
 )
 
-__version__ = "3.3.0"
+__version__ = version("kreuzberg")
 
 __all__ = [
     "EasyOCRConfig",
+    "Entity",
     "ExtractionConfig",
     "ExtractionResult",
     "ExtractorRegistry",
     "GMFTConfig",
     "KreuzbergError",
+    "LanguageDetectionConfig",
     "Metadata",
     "MissingDependencyError",
     "OCRError",
     "PSMMode",
     "PaddleOCRConfig",
     "ParsingError",
+    "SpacyEntityExtractionConfig",
     "TableData",
     "TesseractConfig",
     "ValidationError",
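Usage note: the package root now re-exports Entity, LanguageDetectionConfig, and SpacyEntityExtractionConfig, and __version__ is resolved from the installed distribution metadata rather than a hardcoded string. A minimal sketch using only the names exported above:

import kreuzberg
from kreuzberg import Entity, LanguageDetectionConfig, SpacyEntityExtractionConfig

# __version__ now tracks the installed distribution, e.g. "3.8.1" for this release.
print(kreuzberg.__version__)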
kreuzberg/_api/main.py
ADDED
@@ -0,0 +1,87 @@
+from __future__ import annotations
+
+from json import dumps
+from typing import TYPE_CHECKING, Annotated, Any
+
+from kreuzberg import (
+    ExtractionResult,
+    KreuzbergError,
+    MissingDependencyError,
+    ParsingError,
+    ValidationError,
+    batch_extract_bytes,
+)
+
+if TYPE_CHECKING:
+    from litestar.datastructures import UploadFile
+
+try:
+    from litestar import Litestar, Request, Response, get, post
+    from litestar.contrib.opentelemetry import OpenTelemetryConfig, OpenTelemetryPlugin
+    from litestar.enums import RequestEncodingType
+    from litestar.logging import StructLoggingConfig
+    from litestar.params import Body
+    from litestar.status_codes import (
+        HTTP_400_BAD_REQUEST,
+        HTTP_422_UNPROCESSABLE_ENTITY,
+        HTTP_500_INTERNAL_SERVER_ERROR,
+    )
+except ImportError as e:
+    raise MissingDependencyError.create_for_package(
+        dependency_group="litestar",
+        functionality="Litestar API and docker container",
+        package_name="litestar",
+    ) from e
+
+
+def exception_handler(request: Request[Any, Any, Any], exception: KreuzbergError) -> Response[Any]:
+    if isinstance(exception, ValidationError):
+        status_code = HTTP_400_BAD_REQUEST
+    elif isinstance(exception, ParsingError):
+        status_code = HTTP_422_UNPROCESSABLE_ENTITY
+    else:
+        status_code = HTTP_500_INTERNAL_SERVER_ERROR
+
+    message = str(exception)
+    details = dumps(exception.context)
+
+    if request.app.logger:
+        request.app.logger.error(
+            "API error",
+            method=request.method,
+            url=str(request.url),
+            status_code=status_code,
+            message=message,
+            context=exception.context,
+        )
+
+    return Response(
+        content={"message": message, "details": details},
+        status_code=status_code,
+    )
+
+
+@post("/extract", operation_id="ExtractFiles")
+async def handle_files_upload(
+    data: Annotated[list[UploadFile], Body(media_type=RequestEncodingType.MULTI_PART)],
+) -> list[ExtractionResult]:
+    """Extracts text content from an uploaded file."""
+    return await batch_extract_bytes(
+        [(await file.read(), file.content_type) for file in data],
+    )
+
+
+@get("/health", operation_id="HealthCheck")
+async def health_check() -> dict[str, str]:
+    """A simple health check endpoint."""
+    return {"status": "ok"}
+
+
+app = Litestar(
+    route_handlers=[handle_files_upload, health_check],
+    plugins=[OpenTelemetryPlugin(OpenTelemetryConfig())],
+    logging_config=StructLoggingConfig(),
+    exception_handlers={
+        KreuzbergError: exception_handler,
+    },
+)
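Usage note: a client sketch against the new API (assumes the app is served locally, e.g. with uvicorn kreuzberg._api.main:app; the host, port, and file bytes are hypothetical):

import httpx

with httpx.Client(base_url="http://localhost:8000") as client:
    # GET /health returns a static payload.
    print(client.get("/health").json())  # {"status": "ok"}

    # POST /extract accepts multipart uploads in the "data" field and returns
    # one extraction result per uploaded file.
    response = client.post(
        "/extract",
        files={"data": ("example.html", b"<h1>Hello</h1>", "text/html")},
    )
    print(response.json())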
kreuzberg/_entity_extraction.py
ADDED
@@ -0,0 +1,238 @@
+from __future__ import annotations
+
+import os
+import re
+from dataclasses import dataclass
+from functools import lru_cache
+from typing import TYPE_CHECKING, Any
+
+from kreuzberg._types import Entity
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from collections.abc import Sequence
+    from pathlib import Path
+
+
+@dataclass(unsafe_hash=True, frozen=True)
+class SpacyEntityExtractionConfig:
+    """Configuration for spaCy-based entity extraction."""
+
+    model_cache_dir: str | Path | None = None
+    """Directory to cache spaCy models. If None, uses spaCy's default."""
+
+    language_models: dict[str, str] | tuple[tuple[str, str], ...] | None = None
+    """Mapping of language codes to spaCy model names.
+
+    If None, uses default mappings:
+    - en: en_core_web_sm
+    - de: de_core_news_sm
+    - fr: fr_core_news_sm
+    - es: es_core_news_sm
+    - pt: pt_core_news_sm
+    - it: it_core_news_sm
+    - nl: nl_core_news_sm
+    - zh: zh_core_web_sm
+    - ja: ja_core_news_sm
+    """
+
+    fallback_to_multilingual: bool = True
+    """If True and language-specific model fails, try xx_ent_wiki_sm (multilingual)."""
+
+    max_doc_length: int = 1000000
+    """Maximum document length for spaCy processing."""
+
+    batch_size: int = 1000
+    """Batch size for processing multiple texts."""
+
+    def __post_init__(self) -> None:
+        if self.language_models is None:
+            object.__setattr__(self, "language_models", self._get_default_language_models())
+
+        if isinstance(self.language_models, dict):
+            object.__setattr__(self, "language_models", tuple(sorted(self.language_models.items())))
+
+    @staticmethod
+    def _get_default_language_models() -> dict[str, str]:
+        """Get default language model mappings based on available spaCy models."""
+        return {
+            "en": "en_core_web_sm",
+            "de": "de_core_news_sm",
+            "fr": "fr_core_news_sm",
+            "es": "es_core_news_sm",
+            "pt": "pt_core_news_sm",
+            "it": "it_core_news_sm",
+            "nl": "nl_core_news_sm",
+            "zh": "zh_core_web_sm",
+            "ja": "ja_core_news_sm",
+            "ko": "ko_core_news_sm",
+            "ru": "ru_core_news_sm",
+            "pl": "pl_core_news_sm",
+            "ro": "ro_core_news_sm",
+            "el": "el_core_news_sm",
+            "da": "da_core_news_sm",
+            "fi": "fi_core_news_sm",
+            "nb": "nb_core_news_sm",
+            "sv": "sv_core_news_sm",
+            "ca": "ca_core_news_sm",
+            "hr": "hr_core_news_sm",
+            "lt": "lt_core_news_sm",
+            "mk": "mk_core_news_sm",
+            "sl": "sl_core_news_sm",
+            "uk": "uk_core_news_sm",
+        }
+
+    def get_model_for_language(self, language_code: str) -> str | None:
+        """Get the appropriate spaCy model for a language code."""
+        if not self.language_models:
+            return None
+
+        models_dict = dict(self.language_models) if isinstance(self.language_models, tuple) else self.language_models
+
+        if language_code in models_dict:
+            return models_dict[language_code]
+
+        base_lang = language_code.split("-")[0].lower()
+        if base_lang in models_dict:
+            return models_dict[base_lang]
+
+        return None
+
+    def get_fallback_model(self) -> str | None:
+        """Get fallback multilingual model if enabled."""
+        return "xx_ent_wiki_sm" if self.fallback_to_multilingual else None
+
+
+def extract_entities(
+    text: str,
+    entity_types: Sequence[str] = ("PERSON", "ORGANIZATION", "LOCATION", "DATE", "EMAIL", "PHONE"),
+    custom_patterns: frozenset[tuple[str, str]] | None = None,
+    languages: list[str] | None = None,
+    spacy_config: SpacyEntityExtractionConfig | None = None,
+) -> list[Entity]:
+    """Extract entities from text using custom regex patterns and/or a NER model.
+
+    Args:
+        text: The input text to extract entities from.
+        entity_types: List of entity types to extract using the NER model.
+        custom_patterns: Tuple mapping entity types to regex patterns for custom extraction.
+        languages: List of detected languages to choose appropriate spaCy models.
+        spacy_config: Configuration for spaCy entity extraction.
+
+    Returns:
+        list[Entity]: A list of extracted Entity objects with type, text, start, and end positions.
+
+    Raises:
+        MissingDependencyError: If `spacy` is not installed.
+    """
+    entities: list[Entity] = []
+    if custom_patterns:
+        custom_patterns_dict = dict(custom_patterns)
+        for ent_type, pattern in custom_patterns_dict.items():
+            entities.extend(
+                Entity(type=ent_type, text=match.group(), start=match.start(), end=match.end())
+                for match in re.finditer(pattern, text)
+            )
+
+    if spacy_config is None:
+        spacy_config = SpacyEntityExtractionConfig()
+
+    try:
+        import spacy  # noqa: F401
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="spacy",
+            dependency_group="entity-extraction",
+            functionality="Entity Extraction",
+        ) from e
+
+    model_name = _select_spacy_model(languages, spacy_config)
+    if not model_name:
+        return entities
+
+    nlp = _load_spacy_model(model_name, spacy_config)
+    if not nlp:
+        return entities
+
+    if len(text) > spacy_config.max_doc_length:
+        text = text[: spacy_config.max_doc_length]
+
+    doc = nlp(text)
+
+    entity_type_mapping = {etype.upper() for etype in entity_types}
+
+    entities.extend(
+        Entity(
+            type=ent.label_,
+            text=ent.text,
+            start=ent.start_char,
+            end=ent.end_char,
+        )
+        for ent in doc.ents
+        if ent.label_ in entity_type_mapping or ent.label_.upper() in entity_type_mapping
+    )
+
+    return entities
+
+
+@lru_cache(maxsize=32)
+def _load_spacy_model(model_name: str, spacy_config: SpacyEntityExtractionConfig) -> Any:
+    """Load a spaCy model with caching."""
+    try:
+        import spacy
+
+        if spacy_config.model_cache_dir:
+            os.environ["SPACY_DATA"] = str(spacy_config.model_cache_dir)
+
+        nlp = spacy.load(model_name)
+
+        nlp.max_length = spacy_config.max_doc_length
+
+        return nlp
+    except (OSError, ImportError):
+        return None
+
+
+def _select_spacy_model(languages: list[str] | None, spacy_config: SpacyEntityExtractionConfig) -> str | None:
+    """Select the best spaCy model based on detected languages."""
+    if not languages:
+        return spacy_config.get_model_for_language("en")
+
+    for lang in languages:
+        model_name = spacy_config.get_model_for_language(lang)
+        if model_name:
+            return model_name
+
+    return spacy_config.get_fallback_model()
+
+
+def extract_keywords(
+    text: str,
+    keyword_count: int = 10,
+) -> list[tuple[str, float]]:
+    """Extract keywords from text using the KeyBERT model.
+
+    Args:
+        text: The input text to extract keywords from.
+        keyword_count: Number of top keywords to return. Defaults to 10.
+
+    Returns:
+        list[tuple[str, float]]: A list of tuples containing keywords and their relevance scores.
+
+    Raises:
+        MissingDependencyError: If `keybert` is not installed.
+    """
+    try:
+        from keybert import KeyBERT
+
+        kw_model = KeyBERT()
+        keywords = kw_model.extract_keywords(text, top_n=keyword_count)
+        return [(kw, float(score)) for kw, score in keywords]
+    except (RuntimeError, OSError, ValueError):
+        return []
+    except ImportError as e:
+        raise MissingDependencyError.create_for_package(
+            package_name="keybert",
+            dependency_group="entity-extraction",
+            functionality="Keyword Extraction",
+        ) from e
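Usage note: a sketch of the new module's entry point (the sample text and regex are hypothetical; spaCy plus a matching model such as en_core_web_sm must be installed). The custom-pattern pass runs before the spaCy import, but a missing spaCy install still raises MissingDependencyError:

from kreuzberg._entity_extraction import SpacyEntityExtractionConfig, extract_entities

text = "Jane Doe joined ACME Corp in Berlin. Contact: jane@example.com"

entities = extract_entities(
    text,
    custom_patterns=frozenset({("EMAIL", r"[\w.+-]+@[\w-]+\.[\w.]+")}),
    languages=["en"],
    spacy_config=SpacyEntityExtractionConfig(),
)
for entity in entities:
    print(entity.type, entity.text, entity.start, entity.end)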
kreuzberg/_extractors/_base.py
CHANGED
@@ -3,10 +3,12 @@ from __future__ import annotations
 from abc import ABC, abstractmethod
 from typing import TYPE_CHECKING, ClassVar
 
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._quality import calculate_quality_score, clean_extracted_text
+
 if TYPE_CHECKING:
     from pathlib import Path
 
-    from kreuzberg import ExtractionResult
     from kreuzberg._types import ExtractionConfig
 
 
@@ -90,3 +92,39 @@ class Extractor(ABC):
         return mime_type in cls.SUPPORTED_MIME_TYPES or any(
             mime_type.startswith(supported_type) for supported_type in cls.SUPPORTED_MIME_TYPES
         )
+
+    def _apply_quality_processing(self, result: ExtractionResult) -> ExtractionResult:
+        """Apply quality post-processing to extraction result if enabled.
+
+        Args:
+            result: The raw extraction result
+
+        Returns:
+            Enhanced extraction result with quality improvements (if enabled)
+        """
+        # Only apply quality processing if enabled in config
+        if not self.config.enable_quality_processing:
+            return result
+
+        if not result.content:
+            return result
+
+        # Clean the content
+        cleaned_content = clean_extracted_text(result.content)
+
+        # Calculate quality score
+        quality_score = calculate_quality_score(cleaned_content, dict(result.metadata) if result.metadata else None)
+
+        # Add quality metadata
+        enhanced_metadata = dict(result.metadata) if result.metadata else {}
+        enhanced_metadata["quality_score"] = quality_score
+
+        # Return enhanced result
+        return ExtractionResult(
+            content=cleaned_content,
+            mime_type=result.mime_type,
+            metadata=normalize_metadata(enhanced_metadata),
+            chunks=result.chunks,
+            detected_languages=result.detected_languages,
+            tables=result.tables,
+        )
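Usage note: a sketch of how a concrete extractor is expected to call the new hook (MyTextExtractor and its body are hypothetical; only _apply_quality_processing comes from the diff above):

from typing import ClassVar

from kreuzberg._extractors._base import Extractor
from kreuzberg._types import ExtractionResult

class MyTextExtractor(Extractor):
    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {"text/plain"}

    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
        raw = ExtractionResult(
            content=content.decode("utf-8", errors="replace"),
            mime_type="text/plain",
            metadata={},
            chunks=[],
        )
        # No-op unless config.enable_quality_processing is set; otherwise cleans
        # the text and records a quality_score in the metadata.
        return self._apply_quality_processing(raw)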
kreuzberg/_extractors/_email.py
ADDED
@@ -0,0 +1,149 @@
+from __future__ import annotations
+
+import re
+from html import unescape
+from typing import TYPE_CHECKING, Any, ClassVar
+
+from anyio import Path as AsyncPath
+
+from kreuzberg._extractors._base import Extractor
+from kreuzberg._mime_types import EML_MIME_TYPE, PLAIN_TEXT_MIME_TYPE
+from kreuzberg._types import ExtractionResult, normalize_metadata
+from kreuzberg._utils._string import normalize_spaces
+from kreuzberg._utils._sync import run_sync
+from kreuzberg.exceptions import MissingDependencyError
+
+if TYPE_CHECKING:
+    from pathlib import Path
+
+# Import optional dependencies at module level with proper error handling
+try:
+    import mailparse
+except ImportError:
+    mailparse = None
+
+try:
+    import html2text  # type: ignore[import-not-found]
+except ImportError:
+    html2text = None
+
+# Compile regex pattern once at module level
+_HTML_TAG_PATTERN = re.compile(r"<[^>]+>")
+
+
+class EmailExtractor(Extractor):
+    SUPPORTED_MIME_TYPES: ClassVar[set[str]] = {EML_MIME_TYPE}
+
+    async def extract_bytes_async(self, content: bytes) -> ExtractionResult:
+        return await run_sync(self.extract_bytes_sync, content)
+
+    async def extract_path_async(self, path: Path) -> ExtractionResult:
+        content = await AsyncPath(path).read_bytes()
+        return await self.extract_bytes_async(content)
+
+    def _extract_email_headers(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email headers."""
+        # Use single dict access where possible to avoid repeated lookups
+        subject = parsed_email.get("subject")
+        if subject:
+            metadata["subject"] = subject
+            text_parts.append(f"Subject: {subject}")
+
+        from_info = parsed_email.get("from")
+        if from_info:
+            from_email = from_info.get("email", "") if isinstance(from_info, dict) else str(from_info)
+            metadata["email_from"] = from_email
+            text_parts.append(f"From: {from_email}")
+
+        to_info = parsed_email.get("to")
+        if to_info:
+            if isinstance(to_info, list) and to_info:
+                to_email = to_info[0].get("email", "") if isinstance(to_info[0], dict) else str(to_info[0])
+            elif isinstance(to_info, dict):
+                to_email = to_info.get("email", "")
+            else:
+                to_email = str(to_info)
+            metadata["email_to"] = to_email
+            text_parts.append(f"To: {to_email}")
+
+        date = parsed_email.get("date")
+        if date:
+            metadata["date"] = date
+            text_parts.append(f"Date: {date}")
+
+        cc = parsed_email.get("cc")
+        if cc:
+            metadata["email_cc"] = cc
+            text_parts.append(f"CC: {cc}")
+
+        bcc = parsed_email.get("bcc")
+        if bcc:
+            metadata["email_bcc"] = bcc
+            text_parts.append(f"BCC: {bcc}")
+
+    def _extract_email_body(self, parsed_email: dict[str, Any], text_parts: list[str]) -> None:
+        """Extract and process email body content."""
+        text_content = parsed_email.get("text")
+        if text_content:
+            text_parts.append(f"\n{text_content}")
+            return  # If we have text, prefer it over HTML
+
+        html_content = parsed_email.get("html")
+        if html_content:
+            if html2text is not None:
+                # Use html2text if available (faster path)
+                h = html2text.HTML2Text()
+                h.ignore_links = True
+                h.ignore_images = True
+                converted_text = h.handle(html_content)
+                text_parts.append(f"\n{converted_text}")
+            else:
+                # Fallback: strip HTML tags and unescape entities
+                clean_html = _HTML_TAG_PATTERN.sub("", html_content)
+                clean_html = unescape(clean_html)
+                text_parts.append(f"\n{clean_html}")
+
+    def _extract_email_attachments(
+        self, parsed_email: dict[str, Any], text_parts: list[str], metadata: dict[str, Any]
+    ) -> None:
+        """Extract and process email attachments info."""
+        if parsed_email.get("attachments"):
+            attachment_names = [att.get("name", "unknown") for att in parsed_email["attachments"]]
+            metadata["attachments"] = attachment_names
+            if attachment_names:
+                text_parts.append(f"\nAttachments: {', '.join(attachment_names)}")
+
+    def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
+        if mailparse is None:
+            msg = "mailparse is required for email extraction. Install with: pip install 'kreuzberg[additional-extensions]'"
+            raise MissingDependencyError(msg)
+
+        try:
+            parsed_email = mailparse.EmailDecode.load(content)
+            text_parts: list[str] = []
+            metadata: dict[str, Any] = {}
+
+            # Extract headers, body, and attachments
+            self._extract_email_headers(parsed_email, text_parts, metadata)
+            self._extract_email_body(parsed_email, text_parts)
+            self._extract_email_attachments(parsed_email, text_parts, metadata)
+
+            # Join efficiently
+            combined_text = "\n".join(text_parts)
+
+            return ExtractionResult(
+                content=normalize_spaces(combined_text),
+                mime_type=PLAIN_TEXT_MIME_TYPE,
+                metadata=normalize_metadata(metadata),
+                chunks=[],
+            )
+
+        except Exception as e:
+            msg = f"Failed to parse email content: {e}"
+            raise RuntimeError(msg) from e
+
+    def extract_path_sync(self, path: Path) -> ExtractionResult:
+        content = path.read_bytes()
+        return self.extract_bytes_sync(content)
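Usage note: a sketch for the new extractor (the .eml path is hypothetical, mailparse must be installed, and the constructor arguments are assumed from the mime_type/config attributes used in _base.py):

from pathlib import Path

from kreuzberg import ExtractionConfig
from kreuzberg._extractors._email import EmailExtractor
from kreuzberg._mime_types import EML_MIME_TYPE

extractor = EmailExtractor(mime_type=EML_MIME_TYPE, config=ExtractionConfig())
result = extractor.extract_path_sync(Path("message.eml"))
print(result.metadata.get("subject"))
print(result.content[:200])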
kreuzberg/_extractors/_html.py
CHANGED
@@ -8,7 +8,7 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import HTML_MIME_TYPE, MARKDOWN_MIME_TYPE
 from kreuzberg._types import ExtractionResult
-from kreuzberg._utils._string import
+from kreuzberg._utils._string import safe_decode
 from kreuzberg._utils._sync import run_sync
 
 if TYPE_CHECKING:
@@ -26,8 +26,20 @@ class HTMLExtractor(Extractor):
         return await run_sync(self.extract_bytes_sync, content)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
-
-
+        # Use html-to-markdown with script/nav removal for better quality
+        result = html_to_markdown.convert_to_markdown(
+            safe_decode(content),
+            preprocess_html=True,
+            preprocessing_preset="aggressive",
+            remove_navigation=True,
+            remove_forms=True,
+        )
+
+        # Skip normalize_spaces since quality processing will handle whitespace
+        extraction_result = ExtractionResult(content=result, mime_type=MARKDOWN_MIME_TYPE, metadata={}, chunks=[])
+
+        # Apply quality processing which includes normalization
+        return self._apply_quality_processing(extraction_result)
 
     def extract_path_sync(self, path: Path) -> ExtractionResult:
         content = path.read_bytes()
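Design note: the sync path now delegates to html-to-markdown with aggressive preprocessing and defers whitespace cleanup to _apply_quality_processing instead of normalizing spaces itself. A sketch (the HTML snippet is hypothetical; constructor arguments assumed as in the email example above):

from kreuzberg import ExtractionConfig
from kreuzberg._extractors._html import HTMLExtractor

extractor = HTMLExtractor(mime_type="text/html", config=ExtractionConfig())
result = extractor.extract_bytes_sync(b"<h1>Title</h1><p>Some text.</p>")
print(result.mime_type)  # MARKDOWN_MIME_TYPE
print(result.content)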
kreuzberg/_extractors/_image.py
CHANGED
@@ -1,5 +1,9 @@
 from __future__ import annotations
 
+import contextlib
+import os
+import tempfile
+from pathlib import Path
 from typing import TYPE_CHECKING, ClassVar
 
 from anyio import Path as AsyncPath
@@ -7,6 +11,9 @@ from anyio import Path as AsyncPath
 from kreuzberg._extractors._base import Extractor
 from kreuzberg._mime_types import IMAGE_MIME_TYPES
 from kreuzberg._ocr import get_ocr_backend
+from kreuzberg._ocr._easyocr import EasyOCRConfig
+from kreuzberg._ocr._paddleocr import PaddleOCRConfig
+from kreuzberg._ocr._tesseract import TesseractConfig
 from kreuzberg._utils._tmp import create_temp_file
 from kreuzberg.exceptions import ValidationError
 
@@ -15,9 +22,6 @@ if TYPE_CHECKING:  # pragma: no cover
 
     from kreuzberg._types import ExtractionResult
 
-    import contextlib
-    from pathlib import Path
-
 
 class ImageExtractor(Extractor):
     SUPPORTED_MIME_TYPES: ClassVar[set[str]] = IMAGE_MIME_TYPES
@@ -56,13 +60,11 @@
         if self.config.ocr_backend is None:
            raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-
+        result = await get_ocr_backend(self.config.ocr_backend).process_file(path, **self.config.get_config_dict())
+        return self._apply_quality_processing(result)
 
     def extract_bytes_sync(self, content: bytes) -> ExtractionResult:
         """Pure sync implementation of extract_bytes."""
-        import os
-        import tempfile
-
         extension = self._get_extension_from_mime_type(self.mime_type)
         fd, temp_path = tempfile.mkstemp(suffix=f".{extension}")
 
@@ -80,23 +82,26 @@
         if self.config.ocr_backend is None:
             raise ValidationError("ocr_backend is None, cannot perform OCR")
 
-
-        from kreuzberg._types import ExtractionResult
+        backend = get_ocr_backend(self.config.ocr_backend)
 
         if self.config.ocr_backend == "tesseract":
-
-
-
-
-
-
-
-
-
-
-
-
-
+            config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, TesseractConfig) else TesseractConfig()
+            )
+            result = backend.process_file_sync(path, **config.__dict__)
+        elif self.config.ocr_backend == "paddleocr":
+            paddle_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, PaddleOCRConfig) else PaddleOCRConfig()
+            )
+            result = backend.process_file_sync(path, **paddle_config.__dict__)
+        elif self.config.ocr_backend == "easyocr":
+            easy_config = (
+                self.config.ocr_config if isinstance(self.config.ocr_config, EasyOCRConfig) else EasyOCRConfig()
+            )
+            result = backend.process_file_sync(path, **easy_config.__dict__)
+        else:
+            raise NotImplementedError(f"Sync OCR not implemented for {self.config.ocr_backend}")
+        return self._apply_quality_processing(result)
 
     def _get_extension_from_mime_type(self, mime_type: str) -> str:
         if mime_type in self.IMAGE_MIME_TYPE_EXT_MAP:
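Usage note: the sync dispatch above uses a backend-specific config when one is supplied and otherwise falls back to that backend's defaults. A sketch of driving it directly (the PNG path is hypothetical; requires the tesseract backend, with constructor arguments assumed as in the examples above):

from pathlib import Path

from kreuzberg import ExtractionConfig, TesseractConfig
from kreuzberg._extractors._image import ImageExtractor

config = ExtractionConfig(ocr_backend="tesseract", ocr_config=TesseractConfig())
extractor = ImageExtractor(mime_type="image/png", config=config)
result = extractor.extract_path_sync(Path("scan.png"))
print(result.content)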
|